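/**** This do-file randomizes grade 10 students within the STEM treatment schools.
Inputs (paths are relative to the randomization and clean folder globals):
  - randomization_school_STEM_list.xlsx : school list with the treat indicator
  - grade910_baseline_reachable_wgrade.dta : student-level baseline data
Outputs (written to the randomization folder):
  - randomization_STEM_list.xlsx : mentors allocated to each school
  - randomization_student_STEM_mainlist_balance.csv : balance table
  - randomization_student_STEM_mainlist.xlsx : list of treated students
  - student_STEM_treatschools.dta : students with their assignment
*/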

* Draw 50 candidate random seeds (integers below 10^6) and store their distinct,
* sorted values in the global randomseed. The data in memory are set aside with
* preserve and brought back afterwards.

preserve
	drop _all
	set seed 12345
	set obs 50
	generate value = int(10^6*runiform())
	levelsof value
	global randomseed "`r(levels)'"
restore

********************************************************************************
*
*						GRADE 10 STUDENTS 
*
********************************************************************************
* Load the school-level list and keep only the schools assigned to treatment.
import excel using "$randomization/randomization_school_STEM_list.xlsx", sheet("Sheet1") firstrow clear

* Give the treatment indicator the plain name treat, whatever its suffix.
foreach tvar of varlist treat* {
	rename `tvar' treat
}
keep schoolname treat geoprisize
drop if treat != 1

* Held in a temporary file for the merge with the student data.
tempfile STEM_schools
save `STEM_schools', replace
 
use "$clean/grade910_baseline_reachable_wgrade.dta", clear
drop if schoolname == ""

* check responses from schools that do not have grade 9 and 10 students, then remove them
tabulate schoolname if level910 == 0
drop if level910 == 0

* total students taking the survey, per school: all respondents and grade 10 only
bysort schoolname: egen size = total(responseid != .)
bysort schoolname: egen size10 = total(responseid != . & grade == 10)

* survey rate relative to total910, capped at 1
* (a missing rate also satisfies > 1 in Stata and is therefore set to 1)
generate surveyrate = size/total910
replace surveyrate = 1 if surveyrate > 1

* drop schools with less than 50% survey rate
drop if surveyrate < 0.5

 * wealth index
* Each x9 item is copied to i_x9*, with 99 set to missing and 2 recoded to 0.
foreach item in x9a x9b x9c x9d x9e x9f x9g x9h x9i {
	generate i_`item' = `item'
	recode i_`item' (99 = .) (2 = 0)
}

* The index is the score predicted after a principal component analysis of the nine items.
pca i_x9a-i_x9i
predict wealth_index, score

* restrict to grade 10 students
keep if grade == 10

* attach the treatment-school list and retain only students whose school is in it
merge m:1 schoolname using `STEM_schools'
drop if _merge != 3
drop _merge

* randomize students: the number of treated students depends on the number of
* mentors available to each school, which is proportional to the school's share
* of grade 10 students among the schools flagged by the same stem_* indicator

* Number of mentors available per stem_* indicator
scalar nmentor_stem_rtc = 10
scalar nmentor_stem_cnr = 23
scalar nmentor_stem_jnec = 36
scalar nmentor_stem_cst = 39
scalar nmentor_stem_gcit = 44
scalar nmentor_stem_shrubtse = 48

* br schoolname

* grade 10 students per school
bysort schoolname: egen grade10size = total(grade==10)

* school allocation = int(mentors available * school size / total size of the group);
* int() truncates, so the allocations of a group can sum to less than the mentors available
generate nmentor = .
foreach grp in stem_cst stem_jnec stem_gcit stem_shrubtse stem_cnr stem_rtc {
	egen totalgrade10_`grp' = total(grade==10 & `grp'==1)
	replace nmentor = int(scalar(nmentor_`grp')*(grade10size/totalgrade10_`grp')) if `grp'==1
	drop totalgrade10_`grp'
}

* Total mentors allocated within each stem_* group: nmentor is taken once per
* school, summed over the schools of the group, and the group totals are
* collected into the single variable tmentor_region.
bysort schoolname: generate first_in_school = nmentor if _n == 1
foreach grp in stem_cst stem_jnec stem_gcit stem_shrubtse stem_cnr stem_rtc {
	egen totmentors_`grp' = total(first_in_school) if `grp' == 1
}
egen tmentor_region = rowmax(totmentors_stem_cst-totmentors_stem_rtc)
drop totmentors_* first_in_school

* adjust the number of mentors based on geographical variable:
* first inspect the totals allocated so far
tabulate tmentor_region

* Manual adjustments to nmentor. The allocation above is truncated with int(), and
* the "extra N" notes record how many mentors are handed out by hand in each group:
* one additional mentor to each school matched by the pattern below it.
* NOTE(review): regexm matches the pattern anywhere in schoolname and the replace
* commands are not restricted to the group tabulated just before them, so a school
* in another group whose name contains one of these strings would also be adjusted.
* Confirm that each pattern matches only the intended school.

* stem_rtc: six schools gain one mentor and one school loses one (net 5)
tab schoolname nmentor if stem_rtc==1 // extra 5
replace nmentor = nmentor + 1 if regexm(schoolname, "babesa|druk|khangkhu|khasadrapchu|pelkhil|yangchen")
replace nmentor = nmentor - 1 if regexm(schoolname, "lungtenzampa")

* stem_cnr: four schools gain one mentor
tab schoolname nmentor if stem_cnr==1 // extra 4 mentors
replace nmentor = nmentor + 1 if regexm(schoolname, "khuruthang|kabesa|dechentsemo|phobjikha")

* stem_jnec: three schools gain one mentor
tab schoolname nmentor if stem_jnec==1 // extra 3
replace nmentor = nmentor + 1 if regexm(schoolname, "garpawoong|minjiwoong|phuntshothang")

* stem_cst: two schools gain one mentor
tab schoolname nmentor if stem_cst==1 // extra 2 
replace nmentor = nmentor + 1 if regexm(schoolname, "pakshika|chukha")

* stem_gcit: four schools gain one mentor
tab schoolname nmentor if stem_gcit==1 // extra 4
replace nmentor = nmentor + 1 if regexm(schoolname, "chaskhar|kidhey|kengkhar|tangmachu")

* stem_shrubtse: three schools gain one mentor
tab schoolname nmentor if stem_shrubtse==1 // extra 3
replace nmentor = nmentor + 1 if regexm(schoolname, "kunzangling|tsenkharla|gongthung")



* final check: recompute the total mentors allocated within each stem_* group
* now that nmentor has been adjusted by hand
drop tmentor_region
bysort schoolname: generate first_in_school = nmentor if _n == 1
foreach grp in stem_cst stem_jnec stem_gcit stem_shrubtse stem_cnr stem_rtc {
	egen totmentors_`grp' = total(first_in_school) if `grp' == 1
}
egen tmentor_region = rowmax(totmentors_stem_cst-totmentors_stem_rtc)
drop totmentors_* first_in_school

tabulate tmentor_region // ok

* Export one row per school with its mentor allocation; the student-level data
* are restored afterwards.
preserve
	duplicates drop schoolname, force

	local schoolvars schoolname nmentor grade10size stem_cst stem_jnec stem_gcit stem_shrubtse stem_cnr stem_rtc tmentor_region
	keep `schoolvars'
	order `schoolvars'
	sort stem_cst stem_jnec stem_gcit stem_shrubtse stem_cnr stem_rtc

	export excel "$randomization/randomization_STEM_list.xlsx", sheetreplace firstrow(variables)
restore


****** Baseline variables 
* age
g age = 2021-b_year

* Copies of survey items with the code 99 set to missing:
*   c1a-c1f          academic performance
*   e1c e4c          knowledge about Science stream
*   c2a c2d c2e c2f  liking for science subjects
foreach i in c1a c1b c1c c1d c1e c1f e1c e4c c2a c2d c2e c2f {
	g i_`i' = `i'
	recode i_`i' (99=.)
}

* ranking of STEM
g i_e14c=e14c=="1" // Science stream as first choice
 
* big 5: copy every h1 item with 99 set to missing, then sum three items per trait
* NOTE(review): (5-item) reverses an item, which presumes a 1-4 response scale - confirm
foreach i of varlist h1* {
	g i_`i' = `i'
	recode i_`i' (99=.)
}
g extroversion = (5-i_h1a) + i_h1c + i_h1i
g conscientiousness = (5-i_h1b) + i_h1d + i_h1o
g openness =  i_h1f + i_h1j + i_h1k
g agreeableness = (5-i_h1l) + i_h1m + i_h1n
g neuroticism = (5-i_h1e) + (5-i_h1g) + i_h1h

* having a sibling/relative in STEM: 1 if x11a or x12a holds an answer other than 0 or 99.
* Missing values are excluded explicitly: in Stata a missing value satisfies both
* ~=0 and ~=99, so without the check a student with no answer would be coded as 1.
g relatives_stem = (x11a~=0 & x11a~=99 & !missing(x11a)) | (x12a~=0 & x12a~=99 & !missing(x12a))

* variables tested for balance in the seed search
gl balancetest sex age i_c1a i_c1d i_c1e i_c1f i_c2a i_c2d i_c2e i_c2f c3c1 c3c2 c3c3 c3c4 c3c5 c3c6 c3c7 i_e1c i_e4c i_e14c extroversion conscientiousness openness agreeableness neuroticism relatives_stem

******************************* randomize students *****************************
* For each candidate seed: draw one uniform number per student, rank the draws
* within school, and treat the students ranked within the first 5*nmentor of
* their school. The draw is then tested for balance on every variable listed in
* the global balancetest, and discarded when any absolute t statistic exceeds 3.
* NOTE(review): the draw depends on the order of observations within school at
* this point; confirm that order is reproducible before re-running.
cap drop treatstudent*
foreach j in 670927 {
	set seed `j'
	bys schoolname: g rand_num=runiform()
	bys schoolname: egen ordering=rank(rand_num)
	bys schoolname: g treatstudent_`j' = ordering<=(nmentor*5)
	cap drop rand_num ordering

	* row counter, used later to label the rows of the balance table
	cap g n=_n

	* balance check: treat vs. control students
	* R<seed> gets one row per variable: t statistic and normalized difference.
	* Rows are appended with the row-join operator in the order of the global,
	* so the table always matches the variable list. The previous version joined
	* the rows with the division operator, which is not defined for these matrices.
	cap mat drop R`j'
	foreach variable of global balancetest {
		qui ttest `variable', by(treatstudent_`j')
		mat t`j' = r(t)
		scalar nor`j'=(r(mu_1) - r(mu_2))/sqrt(0.5*(r(sd_1)^2 + r(sd_2)^2))
		mat nor_diff`j' = (`=scalar(nor`j')')
		mat R`j'`variable' = t`j', nor_diff`j'
		mat R`j' = nullmat(R`j') \ R`j'`variable'
	}

	* store the table in the first rows of the dataset
	qui svmat double R`j'

	rename R`j'1 t_`j'
	rename R`j'2 NorDiff_`j'

	* remove randomization results with
	* statistically significant difference in means
	cap g tabs_`j'=abs(t_`j')
	cap qui summ tabs_`j'
	if r(max)>3.0 {
		cap drop *_`j'
	}
	cap drop tabs_`j'
}


* keep the result with minimized maximum abs(normalized difference)
* Works on a copy of the data holding only the NorDiff_<seed> columns of the
* draws that survived the t-statistic filter; the outcome is the local j.
preserve 
keep NorDiff* 
* for each surviving draw, max_NorDiff_<seed> holds its largest absolute
* normalized difference (the same value on every row)
foreach v of varlist NorDiff* {
	cap g abs`v'=abs(`v')
	cap egen max_`v' = max(abs`v')
	drop abs`v'
}
* smallest of those maxima across draws
egen minmax = rowmin(max*)

* drop every draw whose maximum is not the smallest one. The if below is the
* programming command, so it is evaluated on the first observation only; both
* sides are constant across rows
foreach v of varlist NorDiff* {
if max_`v' ~= minmax {
	cap drop `v' max_`v'
}
}
cap drop minmax
* the seed is the part of the variable name after the 8-character prefix NorDiff_ ;
* if several draws remain, the last one in variable order is used
foreach var of varlist NorDiff* {
local seed = substr("`var'", 9, .)
}

local j = `seed'
dis "`j'"
restore 

* remove the school-level treat flag, all candidate assignments and their balance
* statistics; the assignment is regenerated below from the chosen seed
drop treat* t_* NorDiff_* 

* export results with the chosen seed: repeat the draw with seed j
set seed `j'

bys schoolname: g rand_num=runiform()
bys schoolname: egen ordering=rank(rand_num)
bys schoolname: g treatstudent_`j' = ordering<=(nmentor*5)
cap drop rand_num ordering

****** balance check: treat vs. control students
* same list as in the seed search, plus wealth_index
gl balancetest sex age i_c1a i_c1d i_c1e i_c1f i_c2a i_c2d i_c2e i_c2f c3c1 c3c2 c3c3 c3c4 c3c5 c3c6 c3c7 i_e1c i_e4c i_e14c extroversion conscientiousness openness agreeableness neuroticism wealth_index relatives_stem

* R<seed> gets one row per variable with seven columns: N and mean of the group
* with treatstudent==0, N and mean of the group with treatstudent==1, difference
* in means, t statistic and normalized difference. Rows are appended with the
* row-join operator in the order of the global; the previous version joined them
* with the division operator, which is not defined for these matrices. The matrix
* left over from the seed search is dropped first so that nullmat starts empty.
cap mat drop R`j'
foreach variable of global balancetest {
	qui ttest `variable', by(treatstudent_`j')
	mat Cn`j' = r(N_1)
	mat Tn`j' = r(N_2)
	mat C`j' = r(mu_1)
	mat T`j' = r(mu_2)
	mat diff`j' = r(mu_1) - r(mu_2)
	mat t`j' = r(t)
	scalar nor`j'=(r(mu_1) - r(mu_2))/sqrt(0.5*(r(sd_1)^2 + r(sd_2)^2))
	mat nor_diff`j' = (`=scalar(nor`j')')

	mat R`j'`variable' = Cn`j',C`j',Tn`j', T`j',diff`j',t`j', nor_diff`j'
	mat R`j' = nullmat(R`j') \ R`j'`variable'
}

* store the table in the first rows of the dataset and name its columns
qui svmat double R`j'
rename R`j'1 CN_`j'
rename R`j'2 Cmean_`j'
rename R`j'3 TN_`j'
rename R`j'4 Tmean_`j'
rename R`j'5 Diff_`j'
rename R`j'6 t_`j'
rename R`j'7 NorDiff_`j'

cap format C* T* Di* t_* NorDiff* %10.3f
cap format CN* TN* %10.0f

* Export the balance table. The svmat output sits in the first 27 rows of the
* dataset, one row per variable in the order of the global balancetest; n is the
* row counter created during the seed search.
preserve
sort n
cap drop schoolname
cap drop responseid-relatives_stem
cap drop variable
g variable=""
* The row labels follow the order of balancetest, which starts with sex and then
* age. The first two labels were previously swapped.
replace variable="male" if n==1
replace variable="age" if n==2
replace variable="performance Math" if n==3
replace variable="performance Physics" if n==4
replace variable="performance Chemistry" if n==5
replace variable="performance Biology" if n==6
replace variable="liking Math" if n==7
replace variable="liking Physics" if n==8
replace variable="liking Chemistry" if n==9
replace variable="liking Biology" if n==10
replace variable="consult w parents about educareer" if n==11
replace variable="consult w siblings about educareer" if n==12
replace variable="consult w relatives about educareer" if n==13
replace variable="consult w friends about educareer" if n==14
replace variable="consult w neighbors about educareer" if n==15
replace variable="consult w teachers about educareer" if n==16
replace variable="consult w others about educareer" if n==17
replace variable="knowledge about STEM admission" if n==18
replace variable="knowledge about STEM career" if n==19
replace variable="ranking of Science stream" if n==20
replace variable="Big5 extroversion" if n==21
replace variable="Big5 conscientiousness" if n==22
replace variable="Big5 openness" if n==23
replace variable="Big5 agreeableness" if n==24
replace variable="Big5 neuroticism" if n==25
replace variable="wealth index" if n==26
replace variable="having relatives studying/having STEM degrees" if n==27
* Row 28 carries no statistics; it is moved to the top and labelled as the seed
* row. The seed itself appears only as the suffix of the column names.
keep if n<=28
replace n=0 if n==28
sort n
order variable
replace variable="randomization seed" if n==0
cap drop n 
cap drop treat*

export delimited using "$randomization/randomization_student_STEM_mainlist_balance.csv", nolabel datafmt replace
restore 

* Give the assignment variable a name without the seed suffix
foreach assigned of varlist treatstudent* {
	rename `assigned' treatstudent
}

* Export the list of treated students; the full data are restored afterwards
preserve
	drop if treatstudent != 1
	sort schoolname name sex studentid
	cap keep schoolname studentid name sex b_*  phone
	order schoolname name studentid sex b_* phone
	export excel using "$randomization/randomization_student_STEM_mainlist.xlsx", replace firstrow(variables)
restore

* Save all students currently in memory together with their assignment
save "$randomization/student_STEM_treatschools.dta", replace

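/********************************************************************************
******* waitlist: randomly chosen from control students: one per mentor ********
******* NOTE(review): this whole section is commented out. Before re-enabling:
*******  - the stacked matrices list i_c1b and i_c1c, which are not in the
*******    balancetest global, and join rows with "/" where a row join is needed;
*******  - the final draw uses nmentor*5 while the seed search uses nmentor;
*******  - relatives_tti is not created in this do-file, and stream is ordered
*******    after the keep command has removed it.
********************************************************************************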
keep if treatstudent==0

foreach j of global randomseed {
	set seed `j'
	cap drop waitlist 
	bys schoolname: g rand_num=runiform()
	bys schoolname: egen ordering=rank(rand_num)
	bys schoolname: g waitlist`j' = ordering<=(nmentor)
	cap drop rand_num ordering


cap g n=_n

* balance check: waitlist and non-waitlist
foreach variable of global balancetest {
qui ttest `variable', by(waitlist`j')
mat t`j' = r(t)
scalar nor`j'=(r(mu_1) - r(mu_2))/sqrt(0.5*(r(sd_1)^2 + r(sd_2)^2))
mat nor_diff`j' = (`=scalar(nor`j')')
mat R`j'`variable' = t`j', nor_diff`j'
}

mat R`j'=R`j'sex/R`j'age/R`j'i_c1a/R`j'i_c1b/R`j'i_c1c/R`j'i_c1d/R`j'i_c1e/R`j'i_c1f/R`j'i_c2a/R`j'i_c2d/R`j'i_c2e/R`j'i_c2f/R`j'c3c1/R`j'c3c2/R`j'c3c3/R`j'c3c4/R`j'c3c5/R`j'c3c6/R`j'c3c7/R`j'i_e1c/R`j'i_e4c/R`j'i_e14c/R`j'extroversion/R`j'conscientiousness/R`j'openness/R`j'agreeableness/R`j'neuroticism/R`j'wealth_index/R`j'relatives_stem
qui svmat double R`j'

rename R`j'1 t_`j'
rename R`j'2 NorDiff_`j'

* remove randomization results with	
	* statistically significant difference in means
	cap g tabs_`j'=abs(t_`j')
	cap qui summ tabs_`j'
	if r(max)>1.96 {
		cap drop *_`j'
	}
	cap drop tabs_`j'
}

* keep the result with minimized maximum abs(normalized difference)
preserve 
keep NorDiff* 
foreach v of varlist NorDiff* {
	cap g abs`v'=abs(`v')
	cap egen max_`v' = max(abs`v')
	drop abs`v'
}
egen minmax = rowmin(max*)

foreach v of varlist NorDiff* {
if max_`v' ~= minmax {
	cap drop `v' max_`v'
}
}
cap drop minmax
foreach var of varlist NorDiff* {
local seed = substr("`var'", 9, .)
}

local j = `seed'
restore 

drop waitlist* t_* NorDiff_* 

* Compare means and compute normalized difference
set seed `j'

bys schoolname: g rand_num=runiform()
bys schoolname: egen ordering=rank(rand_num)
bys schoolname: g waitlist`j' = ordering<=(nmentor*5)
cap drop rand_num ordering

****** balance check: treat vs. control students
foreach variable of global balancetest {
qui ttest `variable', by(waitlist`j')
mat Cn`j' = r(N_1)
mat Tn`j' = r(N_2)
mat C`j' = r(mu_1)
mat T`j' = r(mu_2)
mat diff`j' = r(mu_1) - r(mu_2)
mat diff`j' = r(mu_1) - r(mu_2)
mat t`j' = r(t)
scalar nor`j'=(r(mu_1) - r(mu_2))/sqrt(0.5*(r(sd_1)^2 + r(sd_2)^2))
mat nor_diff`j' = (`=scalar(nor`j')')

mat R`j'`variable' = Cn`j',C`j',Tn`j', T`j',diff`j',t`j', nor_diff`j'
}

mat R`j'=R`j'sex/R`j'age/R`j'i_c1a/R`j'i_c1b/R`j'i_c1c/R`j'i_c1d/R`j'i_c1e/R`j'i_c1f/R`j'i_c2a/R`j'i_c2d/R`j'i_c2e/R`j'i_c2f/R`j'c3c1/R`j'c3c2/R`j'c3c3/R`j'c3c4/R`j'c3c5/R`j'c3c6/R`j'c3c7/R`j'i_e1c/R`j'i_e4c/R`j'i_e14c/R`j'extroversion/R`j'conscientiousness/R`j'openness/R`j'agreeableness/R`j'neuroticism/R`j'wealth_index/R`j'relatives_stem

qui svmat double R`j'
rename R`j'1 CN_`j'
rename R`j'2 Cmean_`j'
rename R`j'3 TN_`j'
rename R`j'4 Tmean_`j'
rename R`j'5 Diff_`j'
rename R`j'6 t_`j'
rename R`j'7 NorDiff_`j'

cap format C* T* Di* t_* NorDiff* %10.3f
cap format CN* TN* %10.0f

preserve
sort n
cap drop schoolname
cap drop responseid-relatives_tti
cap drop variable
g variable=""
replace variable="age" if n==1
replace variable="male" if n==2
replace variable="performance Math" if n==3
replace variable="performance Dzongkha" if n==4
replace variable="performance English" if n==5
replace variable="performance Physics" if n==6
replace variable="performance Chemistry" if n==7
replace variable="performance Biology" if n==8
replace variable="liking Math" if n==9
replace variable="liking Physics" if n==10
replace variable="liking Chemistry" if n==11
replace variable="liking Biology" if n==12
replace variable="consult w parents about educareer" if n==13
replace variable="consult w siblings about educareer" if n==14
replace variable="consult w relatives about educareer" if n==15
replace variable="consult w friends about educareer" if n==16
replace variable="consult w neighbors about educareer" if n==17
replace variable="consult w teachers about educareer" if n==18
replace variable="consult w others about educareer" if n==19
replace variable="knowledge about STEM admission" if n==20
replace variable="knowledge about STEM career" if n==21
replace variable="ranking of Science stream" if n==22
replace variable="Big5 extroversion" if n==23
replace variable="Big5 conscientiousness" if n==24
replace variable="Big5 openness" if n==25
replace variable="Big5 agreeableness" if n==26
replace variable="Big5 neuroticism" if n==27
replace variable="wealth index" if n==28
replace variable="having relatives studying/having STEM degrees" if n==29
keep if n<=30
replace n=0 if n==30
sort n
order variable
replace variable="randomization seed" if n==0
cap drop n 
cap drop treat*

export delimited using "$randomization/randomization_student_STEM_waitlist_balance.csv", nolabel datafmt replace
restore 

foreach v of varlist waitlist* {
	rename `v' waitlist
}

preserve 
keep if waitlist==1
sort schoolname name sex studentid 
cap keep schoolname studentid name sex b_* phone
order schoolname name studentid sex b_* stream phone
export excel using "$randomization/randomization_student_STEM_waitlist.xlsx", replace firstrow(variables)
restore 
*/
